import os
import sys
import gzip
from collections import defaultdict
import pybedtools
from Bio import SeqIO


# Name of the data set to process, taken from the first command-line argument.
# It selects the "<target>.psl" input file and names the "<target>.gff" output.
target = sys.argv[1]

# Genome assembly label; not referenced anywhere else in the visible code.
assembly = "hg38"

def get_transcript_list(target, directory="/osc-fs_home/mdehoon/Data/CASPARs/Filters"):
    """Return the set of transcript names listed in ``<target>.psl``.

    Every line of the file must consist of exactly 21 whitespace-separated
    fields; the transcript name is taken from the tenth field (index 9).
    Each transcript may appear on one line only.

    Args:
        target: Base name (without the ``.psl`` extension) of the file to read.
        directory: Directory containing the PSL file.  Defaults to the
            location used by the original pipeline.

    Returns:
        A ``set`` of transcript names.

    Raises:
        ValueError: If a line does not have 21 fields, or if a transcript
            name occurs more than once.
        OSError: If the file cannot be opened.
    """
    path = os.path.join(directory, "%s.psl" % target)
    print("Reading", path)
    transcripts = set()
    # The context manager guarantees the file is closed even if a malformed
    # line makes us bail out half-way through.
    with open(path) as stream:
        for number, line in enumerate(stream, start=1):
            words = line.split()
            if len(words) != 21:
                raise ValueError(
                    "%s, line %d: expected 21 fields, found %d"
                    % (path, number, len(words))
                )
            transcript = words[9]
            if transcript in transcripts:
                raise ValueError(
                    "%s, line %d: duplicate transcript %s"
                    % (path, number, transcript)
                )
            transcripts.add(transcript)
    return transcripts

def parse_refseq_genbank(directory="/osc-fs_home/scratch/mdehoon/Data/NCBI/refseq"):
    """Yield every GenBank record from the human RNA files in *directory*.

    Files named ``human.<N>.rna.gbff.gz`` are read in increasing order of the
    integer ``<N>``; all other directory entries are ignored.  Records are
    yielded lazily, so only one file is open at any time.

    Args:
        directory: Directory holding the gzip-compressed GenBank files.
            Defaults to the location used by the original pipeline.

    Yields:
        The records produced by ``SeqIO.parse(handle, 'genbank')`` for each
        file in turn.
    """
    for filename in _refseq_rna_filenames(directory):
        path = os.path.join(directory, filename)
        print("Reading", path)
        # Using a context manager closes the file when it is exhausted, when
        # parsing fails, and when the caller abandons the generator early.
        with gzip.open(path, 'rt') as handle:
            yield from SeqIO.parse(handle, 'genbank')


def _refseq_rna_filenames(directory):
    """Return the ``human.<N>.rna.gbff.gz`` names in *directory*, sorted by N."""
    numbered = []
    for filename in os.listdir(directory):
        terms = filename.split('.')
        if len(terms) != 5:
            continue
        prefix, index, *suffix = terms
        if prefix != 'human' or suffix != ['rna', 'gbff', 'gz']:
            continue
        try:
            number = int(index)
        except ValueError:
            # A non-numeric index is not part of the numbered file series;
            # skip it rather than crash while sorting.
            continue
        numbered.append((number, filename))
    # Sort on the number only, so that ties keep their directory-listing order.
    numbered.sort(key=lambda pair: pair[0])
    return [filename for _, filename in numbered]

def write_gff(genes):
    """Write ``<target>.gff`` with one line per annotated PSL alignment.

    Reads ``<target>.psl`` (``target`` being the module-level variable) and,
    for every alignment whose query name is a key of *genes*, writes one
    interval to ``<target>.gff`` in the current working directory.
    Alignments whose query name is missing from *genes* are skipped.

    Args:
        genes: Mapping from transcript name (the PSL query name) to gene name.

    Raises:
        ValueError: If a line of the PSL file does not have 21 fields.
        OSError: If either file cannot be opened.
    """
    directory = "/osc-fs_home/mdehoon/Data/CASPARs/Filters"
    path = os.path.join(directory, "%s.psl" % target)
    print("Reading", path)
    # Both files are managed by ``with`` so that they are closed (and the
    # output flushed) even if a line fails to parse.
    with open(path) as stream:
        filename = "%s.gff" % target
        print("Writing", filename)
        with open(filename, "w") as output:
            for number, line in enumerate(stream, start=1):
                words = line.split()
                if len(words) != 21:
                    raise ValueError(
                        "%s, line %d: expected 21 fields, found %d"
                        % (path, number, len(words))
                    )
                strand = words[8]
                qName = words[9]
                tName = words[13]
                tStart = int(words[15])
                tEnd = int(words[16])
                gene = genes.get(qName)
                if gene is None:
                    continue
                attrs = "transcript=%s;gene=%s" % (qName, gene)
                # PSL coordinates are zero-based and half-open, GFF coordinates
                # are one-based and inclusive: shift the start only.
                fields = [tName, "transcript", target, tStart + 1, tEnd, ".", strand, ".", attrs]
                interval = pybedtools.create_interval_from_list(fields)
                # Written as-is: no line terminator is added here.
                output.write(str(interval))

# Transcript names present in the PSL file for this target.
transcripts = get_transcript_list(target)

# Map each of those transcripts to the value of the 'gene' qualifier on the
# first 'gene' feature of its GenBank record.
genes = {}
records = parse_refseq_genbank()
for record in records:
    transcript = record.id
    if transcript in transcripts:
        feature = next((item for item in record.features if item.type == 'gene'), None)
        if feature is None:
            raise Exception("Failed to find gene for %s" % transcript)
        names = feature.qualifiers['gene']
        assert len(names) == 1
        gene = names[0]
        genes[transcript] = gene

write_gff(genes)
